Chapter 5 Community composition
5.1 Count data preparation
genome_counts_log <- genome_counts %>%
column_to_rownames(var="genome") %>%
mutate_all(~log10(.+1)) #fixed: mutate_at(vars(), ~log10(.+1))) was not working
genome_counts_pivot <- genome_counts %>%
mutate_at(vars(-genome),~./sum(.)) %>% #apply TSS normalisation
pivot_longer(-genome, names_to = "sample", values_to = "count") %>% #reduce to minimum number of columns
left_join(., genome_metadata, by = join_by(genome == genome))
# %>% #append taxonomy
# mutate(phylum = fct_relevel(phylum, rev(ehi_phylum_colors$phylum))) #sort phyla by taxonomy
genome_counts_by_host <- sample_metadata %>%
select("sample","species","area_type", "development") %>%
rename(host_sp=species) %>%
left_join(genome_counts_pivot,., by=join_by("sample" == "sample")) #%>%
# Which host species each genome can be found in
genomes_by_species <- genome_counts_by_host %>%
filter(count>0) %>%
group_by(genome) %>%
mutate(host = if_else(all(host_sp == "Sciurus vulgaris"), "only red",
if_else(all(host_sp == "Sciurus carolinensis"), "only grey", "both"))) %>%
select(genome, host) %>%
distinct(genome, .keep_all = TRUE) %>%
left_join(.,genome_metadata, by='genome')
genomes_by_species$host <-factor(genomes_by_species$host, levels = c("both", "only red", "only grey"))5.2 Genomes by host species
# Generate the phylum color heatmap
phylum_heatmap <- genome_metadata %>%
arrange(match(genome, genome_tree$tip.label)) %>%
select(genome,phylum) %>%
mutate(phylum = factor(phylum, levels = unique(phylum))) %>%
column_to_rownames(var = "genome")
# Create baseline circular genome tree
circular_tree <- force.ultrametric(genome_tree,method="extend") %>%
ggtree(., layout = 'circular', size = 0.1, angle=45) +
xlim(-1, NA)***************************************************************
* Note: *
* force.ultrametric does not include a formal method to *
* ultrametricize a tree & should only be used to coerce *
* a phylogeny that fails is.ultrametric due to rounding -- *
* not as a substitute for formal rate-smoothing methods. *
***************************************************************
# Add phylum colors ring
circular_tree <- gheatmap(circular_tree, phylum_heatmap, offset=0.0, width=0.3, colnames=FALSE) +
scale_fill_manual(values=custom_colors, name="Phylum") +
#geom_tiplab2(size=1, hjust=-0.1) +
theme(legend.position = "right", plot.margin = margin(0, 0, 0, 0), panel.margin = margin(0, 0, 0, 0))
#Flush color scale to enable a new color scheme in the next ring
circular_tree <- circular_tree + new_scale_fill()
# Add host ring
circular_tree <- circular_tree +
new_scale_fill() +
scale_fill_manual(values = c("black", "#ed2939", "#92a0ad"), name="Host\nspecies") + #"#cc3333", "#999999"
geom_fruit(
data=genomes_by_species,
geom=geom_tile,
mapping = aes(y=genome, fill=host),
offset = 0.55,
width=0.3)
#Plot circular tree
circular_tree#number of MAGs by host species
genomes_by_species %>%
dplyr::group_by(host) %>%
summarise(n=length(host),
percentage=(length(host)/1687)*100) %>%
tt()| host | n | percentage |
|---|---|---|
| both | 505 | 29.93480 |
| only red | 482 | 28.57143 |
| only grey | 700 | 41.49378 |
#most abundant phyla by host species
genome_counts_by_host %>%
filter(count>0) %>%
group_by(host_sp,phylum) %>%
summarise(rel_abundance=sum(count)) %>%
top_n(3, rel_abundance) %>%
arrange(by_group=host_sp, desc(rel_abundance)) %>%
tt()| host_sp | phylum | rel_abundance |
|---|---|---|
| Sciurus carolinensis | p__Bacillota_A | 55.472233 |
| Sciurus carolinensis | p__Bacteroidota | 16.149398 |
| Sciurus carolinensis | p__Bacillota | 2.498674 |
| Sciurus vulgaris | p__Bacillota_A | 48.125675 |
| Sciurus vulgaris | p__Bacteroidota | 32.438965 |
| Sciurus vulgaris | p__Actinomycetota | 16.517267 |
#most abundant class by host species
genome_counts_by_host %>%
filter(count>0) %>%
group_by(host_sp,class) %>%
summarise(total_count=sum(count)) %>%
top_n(5, total_count) %>%
arrange(by_group=host_sp, desc(total_count)) %>%
tt()| host_sp | class | total_count |
|---|---|---|
| Sciurus carolinensis | c__Clostridia | 55.472233 |
| Sciurus carolinensis | c__Bacteroidia | 16.149398 |
| Sciurus carolinensis | c__Bacilli | 2.498674 |
| Sciurus carolinensis | c__Verrucomicrobiae | 2.094198 |
| Sciurus carolinensis | c__Vampirovibrionia | 1.253040 |
| Sciurus vulgaris | c__Clostridia | 48.125675 |
| Sciurus vulgaris | c__Bacteroidia | 32.438965 |
| Sciurus vulgaris | c__Actinomycetia | 16.040176 |
| Sciurus vulgaris | c__Bacilli | 8.233587 |
| Sciurus vulgaris | c__Gammaproteobacteria | 2.254878 |
#most common class by host species
genome_counts_by_host %>%
#filter(count>0) %>%
group_by(host_sp,class) %>%
summarise(freq=sum(count>0)/n()) %>%
top_n(3, freq) %>%
arrange(by_group=host_sp, desc(freq)) %>%
tt()| host_sp | class | freq |
|---|---|---|
| Sciurus carolinensis | c__Campylobacteria | 0.4625000 |
| Sciurus carolinensis | c__Verrucomicrobiae | 0.3875000 |
| Sciurus carolinensis | c__Peptococcia | 0.3333333 |
| Sciurus vulgaris | c__Peptococcia | 0.1666667 |
| Sciurus vulgaris | c__Campylobacteria | 0.1454545 |
| Sciurus vulgaris | c__Negativicutes | 0.1352273 |
#most abundant family by host species
genome_counts_by_host %>%
filter(count>0) %>%
group_by(host_sp,family) %>%
summarise(abundance=sum(count)) %>%
top_n(5, abundance) %>%
arrange(by_group=host_sp, desc(abundance)) %>%
tt()| host_sp | family | abundance |
|---|---|---|
| Sciurus carolinensis | f__Lachnospiraceae | 29.253695 |
| Sciurus carolinensis | f__Oscillospiraceae | 16.103469 |
| Sciurus carolinensis | f__Bacteroidaceae | 9.370700 |
| Sciurus carolinensis | f__Muribaculaceae | 4.737998 |
| Sciurus carolinensis | f__Borkfalkiaceae | 2.527615 |
| Sciurus vulgaris | f__Lachnospiraceae | 35.793100 |
| Sciurus vulgaris | f__Bacteroidaceae | 21.149209 |
| Sciurus vulgaris | f__Bifidobacteriaceae | 15.986288 |
| Sciurus vulgaris | f__Muribaculaceae | 9.598616 |
| Sciurus vulgaris | f__Oscillospiraceae | 4.267142 |
##FIX!!!
#most common family by host species
genome_counts_by_host %>%
group_by(host_sp) %>%
mutate(n = length(sample)) %>%
ungroup() %>%
# Group by host species and family to count the number of counts > 0
group_by(host_sp, sample, family) %>%
summarise(
pos = sum(count > 0),
n = first(n),
freq = sum(count > 0) / first(n)
) %>%
# Select the top 3 families by frequency for each host species
group_by(host_sp) %>%
top_n(3, freq) %>%
# Sort the results
arrange(host_sp, desc(freq)) %>%
tt()| host_sp | sample | family | pos | n | freq |
|---|---|---|---|---|---|
| Sciurus carolinensis | EHI01467 | f__Lachnospiraceae | 161 | 134960 | 0.0011929461 |
| Sciurus carolinensis | EHI00382 | f__Oscillospiraceae | 142 | 134960 | 0.0010521636 |
| Sciurus carolinensis | EHI01480 | f__Lachnospiraceae | 138 | 134960 | 0.0010225252 |
| Sciurus vulgaris | EHI01458 | f__Lachnospiraceae | 91 | 185570 | 0.0004903810 |
| Sciurus vulgaris | EHI01512 | f__Lachnospiraceae | 91 | 185570 | 0.0004903810 |
| Sciurus vulgaris | EHI02304 | f__Lachnospiraceae | 71 | 185570 | 0.0003826049 |
***************************************************************
* Note: *
* force.ultrametric does not include a formal method to *
* ultrametricize a tree & should only be used to coerce *
* a phylogeny that fails is.ultrametric due to rounding -- *
* not as a substitute for formal rate-smoothing methods. *
***************************************************************
#Add phylum colors
vertical_tree <- gheatmap(vertical_tree, phylum_heatmap, offset=0, width=0.1, colnames=FALSE, color=NA) +
scale_fill_manual(values=custom_colors)
#Reset fill scale
vertical_tree <- vertical_tree + new_scale_fill()
#Add counts
vertical_tree <- gheatmap(vertical_tree, genome_counts_log, offset=0.5, width=3.5, color=NA, colnames=FALSE) + #, colnames_angle=90, font.size=2, colnames_position="top", colnames_offset_y = 9
vexpand(.08) +
coord_cartesian(clip = "off") +
scale_fill_gradient(low = "#f4f4f4", high = "steelblue", na.value="white")
#Plot tree
vertical_tree +
theme(legend.position='right')5.3 Taxonomic composition of samples
# sample_sort <- sample_metadata %>%
# arrange(Area_type) %>%
# select(sample) %>%
# pull()
# Plot stacked barplot
ggplot(genome_counts_by_host, aes(x=sample,y=count,fill=phylum, group=phylum))+ #grouping enables keeping the same sorting of taxonomic units
geom_bar(stat="identity", colour="white", linewidth=0.02)+ #plot stacked bars with white borders
scale_fill_manual(values=custom_colors, name="Phylum") +
labs(y = "Relative abundance") +
guides(fill = guide_legend(ncol = 1)) +
facet_nested(~host_sp, scales="free", space="free") +
theme(axis.text.x = element_blank(),
axis.title.x = element_blank(),
panel.background = element_blank(),
panel.border = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
axis.line = element_line(linewidth = 0.5, linetype = "solid", colour = "black"),
legend.position="right",
)phylum_summary <- genome_counts_by_host %>%
group_by(sample,host_sp,phylum) %>%
summarise(relabun=sum(count))
phylum_summary %>%
group_by(phylum) %>%
summarise(mean=mean(relabun, na.rm=TRUE),sd=sd(relabun, na.rm=TRUE)) # A tibble: 13 × 3
phylum mean sd
<chr> <dbl> <dbl>
1 p__Actinomycetota 0.0898 0.234
2 p__Bacillota 0.0565 0.110
3 p__Bacillota_A 0.545 0.252
4 p__Bacillota_B 0.00255 0.00418
5 p__Bacillota_C 0.00429 0.00664
6 p__Bacteroidota 0.256 0.178
7 p__Campylobacterota 0.00521 0.0250
8 p__Cyanobacteriota 0.00895 0.0118
9 p__Elusimicrobiota 0.00170 0.00694
10 p__Patescibacteria 0.0000251 0.000346
11 p__Pseudomonadota 0.0180 0.0738
12 p__Thermoplasmatota 0.000351 0.00366
13 p__Verrucomicrobiota 0.0116 0.0229
phylum_arrange <- phylum_summary %>%
group_by(phylum) %>%
summarise(mean=mean(relabun)) %>%
arrange(-mean) %>%
select(phylum) %>%
pull()
phylum_summary %>%
filter(phylum %in% phylum_arrange) %>%
mutate(phylum=factor(phylum,levels=rev(phylum_arrange))) %>%
ggplot(aes(x=relabun, y=phylum, group=phylum, color=phylum)) +
scale_color_manual(values=custom_colors, name="Phylum") +
geom_jitter(alpha=0.5) +
facet_nested(~host_sp, scales="free", space="free") +
theme_minimal() +
theme(legend.position="none") +
labs(y="Phylum",x="Relative abundance") family_summary <- genome_counts_by_host %>%
group_by(sample,host_sp,family) %>%
summarise(relabun=sum(count))
family_summary %>%
group_by(family) %>%
summarise(mean=mean(relabun, na.rm=TRUE),sd=sd(relabun, na.rm=TRUE)) %>%
arrange(-mean)# A tibble: 74 × 3
family mean sd
<chr> <dbl> <dbl>
1 f__Lachnospiraceae 0.342 0.192
2 f__Bacteroidaceae 0.161 0.146
3 f__Oscillospiraceae 0.107 0.104
4 f__Bifidobacteriaceae 0.0841 0.233
5 f__Muribaculaceae 0.0755 0.0956
6 f__Ruminococcaceae 0.0329 0.0377
7 f__Borkfalkiaceae 0.0192 0.0270
8 f__Lactobacillaceae 0.0163 0.0514
9 f__Streptococcaceae 0.0161 0.0866
10 f__Acutalibacteraceae 0.0109 0.0135
# ℹ 64 more rows
family_arrange <- family_summary %>%
group_by(family) %>%
summarise(mean=sum(relabun)) %>%
arrange(-mean) %>%
select(family) %>%
pull()
family_summary %>%
left_join(genome_metadata %>% select(family,phylum) %>% unique(),by=join_by(family==family)) %>%
left_join(sample_metadata,by=join_by(sample==sample)) %>%
filter(family %in% family_arrange[1:20]) %>%
mutate(family=factor(family,levels=rev(family_arrange[1:20]))) %>%
filter(relabun > 0) %>%
ggplot(aes(x=relabun, y=family, group=family, color=phylum)) +
scale_color_manual(values=custom_colors) +
geom_jitter(alpha=0.5) +
facet_grid(.~host_sp)+
theme_minimal() +
labs(y="Family", x="Relative abundance", color="Phylum") +
guides(colour = guide_legend(override.aes = list(size=4, alpha=0.9)))#NB: there are several unnamed genera from different families that get grouped together
genus_summary <- genome_counts_by_host %>%
group_by(sample,host_sp,genus) %>%
summarise(relabun=sum(count))
genus_summary %>%
group_by(genus) %>%
summarise(mean=mean(relabun, na.rm=TRUE),sd=sd(relabun, na.rm=TRUE)) %>%
arrange(-mean)# A tibble: 348 × 3
genus mean sd
<chr> <dbl> <dbl>
1 g__Bifidobacterium 0.0841 0.233
2 g__Prevotella 0.0732 0.0968
3 g__Acetatifactor 0.0429 0.0621
4 g__ 0.0383 0.0427
5 g__Eisenbergiella 0.0322 0.0556
6 g__Faecousia 0.0270 0.0366
7 g__CAG-95 0.0219 0.0326
8 g__Roseburia 0.0215 0.0526
9 g__Phocaeicola 0.0194 0.0292
10 g__Dysosmobacter 0.0179 0.0175
# ℹ 338 more rows
genus_arrange <- genus_summary %>%
group_by(genus) %>%
summarise(mean=sum(relabun)) %>%
arrange(-mean) %>%
select(genus) %>%
pull()
genus_summary %>%
left_join(genome_metadata %>% select(genus,phylum) %>% unique(),by=join_by(genus==genus)) %>%
left_join(sample_metadata,by=join_by(sample==sample)) %>%
filter(genus %in% genus_arrange[1:20]) %>%
mutate(genus=factor(genus,levels=rev(genus_arrange[1:20]))) %>%
filter(relabun > 0) %>%
ggplot(aes(x=relabun, y=genus, group=genus, color=phylum)) +
scale_color_manual(values=custom_colors) +
geom_jitter(alpha=0.5) +
facet_grid(.~host_sp)+
theme_minimal() +
labs(y="Genus", x="Relative abundance", color="Phylum") +
guides(colour = guide_legend(override.aes = list(size=5, alpha=0.9)))Warning in left_join(., genome_metadata %>% select(genus, phylum) %>% unique(), : Detected an unexpected many-to-many relationship between `x` and `y`.
ℹ Row 1 of `x` matches multiple rows in `y`.
ℹ Row 41 of `y` matches multiple rows in `x`.
ℹ If a many-to-many relationship is expected, set `relationship = "many-to-many"` to silence this warning.